US Plant Collections Digitization Status

Degree of Digitization, US Plant Collection Data from GBIF

library(MASS) # provides the parcoord() function that automatically builds parallel coordinates chart
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.3.2     ✔ purrr   0.3.2
## ✔ tibble  3.0.2     ✔ dplyr   1.0.0
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Import the GBIF data dump. Per the column specification printed below, the file has an
# `institutionCode` column, a `total` column and one numeric `has_*` column per term
# (presumably one summary row per institutionCode -- confirm against the CSV).
gbif_us_dataset <- read_csv("../gbif_institutionCode_summmary/gbif_us_institutionCode_MIDS_2020.07.08.csv")
## Parsed with column specification:
## cols(
##   institutionCode = col_character(),
##   total = col_double(),
##   has_collectionCode = col_double(),
##   has_catalogNumber = col_double(),
##   has_speciesKey = col_double(),
##   has_scientificName = col_double(),
##   has_acceptedNameUsage = col_double(),
##   has_locality = col_double(),
##   has_higherGeography = col_double(),
##   has_countryCode = col_double(),
##   has_coordinates = col_double(),
##   has_image = col_double(),
##   has_dateIdentified = col_double(),
##   has_identifiedBy = col_double(),
##   has_recordedBy = col_double(),
##   has_eventDate = col_double()
## )
# Sort columns based on MIDS level. After this step the 12 has_* term columns
# listed here occupy positions 1:12; every other column follows them.
gbif_us_dataset <- gbif_us_dataset %>%
  relocate(
    has_catalogNumber, has_collectionCode, has_countryCode, has_speciesKey,
    has_locality, has_eventDate, has_recordedBy, has_coordinates, has_image,
    has_identifiedBy, has_dateIdentified, has_higherGeography
  )

# Filter dataset for easier viewing: the rows with the 5 largest and the
# 5 smallest values of `total`.
top_5_gbif_us_dataset <- gbif_us_dataset %>% top_n(5, total)
bottom_5_gbif_us_dataset <- gbif_us_dataset %>% top_n(-5, total)

# Generate percentage of totals as data: divide each of the 12 term columns by
# the row's `total`, giving a fraction of the total per term.
# The function must be passed *inside* across(); the previous form,
# mutate(across(everything()), . / total), only worked because mutate()
# splices an unnamed data-frame argument into the result.
percentage_gbif_us_dataset <- gbif_us_dataset[, 1:12] %>%
  mutate(across(everything(), ~ .x / gbif_us_dataset$total))

# Filter percentages: same calculation for the top-5 and bottom-5 subsets.
top_5_percentage_gbif_us_dataset <- top_5_gbif_us_dataset[, 1:12] %>%
  mutate(across(everything(), ~ .x / top_5_gbif_us_dataset$total))
bottom_5_percentage_gbif_us_dataset <- bottom_5_gbif_us_dataset[, 1:12] %>%
  mutate(across(everything(), ~ .x / bottom_5_gbif_us_dataset$total))

## NULL

The plot above shows an “all or nothing” approach to the digitization of terms (i.e., the up-and-down pattern seen all over the plot): for any given term, an institution has either close to 0% or close to 100% coverage in its data.

Top 5

# Draw the chart for the 5 institutions with the largest `total`, passing both the raw
# counts and the per-term fractions.
# NOTE(review): generate_mass_parcoord_plot() is not defined in the visible part of this
# document; presumably it wraps MASS::parcoord() -- confirm.
mass_parcoord_plot <- generate_mass_parcoord_plot(top_5_gbif_us_dataset, top_5_percentage_gbif_us_dataset)

# Printing the stored value shows NULL in the rendered output.
mass_parcoord_plot
## NULL

Again, it is apparent that different collections use terms differently or place emphasis on digitizing different pieces of information (“all or nothing” use of terms).

Bottom 5

# Draw the chart for the 5 institutions with the smallest `total`, passing both the raw
# counts and the per-term fractions.
# NOTE(review): generate_mass_parcoord_plot() is not defined in the visible part of this
# document; presumably it wraps MASS::parcoord() -- confirm.
mass_parcoord_plot <- generate_mass_parcoord_plot(bottom_5_gbif_us_dataset, bottom_5_percentage_gbif_us_dataset)

# Printing the stored value shows NULL in the rendered output.
mass_parcoord_plot
## NULL

This plot is hard to decipher because of the data: when two adjacent terms both have a value of 1, no line appears to be drawn between them, and the same happens when two adjacent terms both have a value of 0. This needs to be fixed if the plot proves useful.

Plotting with the GGally package

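I can’t figure out why the plot above is so distorted: it shows points above 1 and below -1, but no such values exist in the data. This needs further investigation. (One thing to check: ggparcoord() rescales every column according to its `scale` argument, and the default, "std", standardizes the values instead of plotting them as they are.)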

Full GBIF dataset

# TODO: NEED TO CLEAN THIS CODE UP
# (the same CSV was already imported at the top of this document and is read again here)

# Re-import the GBIF data dump; the result is passed to generate_ggally_parcoord_plot() below.
gbif_us_dataset <- read_csv("../gbif_institutionCode_summmary/gbif_us_institutionCode_MIDS_2020.07.08.csv")
## Parsed with column specification:
## cols(
##   institutionCode = col_character(),
##   total = col_double(),
##   has_collectionCode = col_double(),
##   has_catalogNumber = col_double(),
##   has_speciesKey = col_double(),
##   has_scientificName = col_double(),
##   has_acceptedNameUsage = col_double(),
##   has_locality = col_double(),
##   has_higherGeography = col_double(),
##   has_countryCode = col_double(),
##   has_coordinates = col_double(),
##   has_image = col_double(),
##   has_dateIdentified = col_double(),
##   has_identifiedBy = col_double(),
##   has_recordedBy = col_double(),
##   has_eventDate = col_double()
## )
# Build a GGally parallel-coordinates plot of per-term digitization coverage.
#
# Args:
#   gbif_us_dataset: data frame with an `institutionCode` column, a `total`
#     column and the 12 has_* term columns named in `mids_terms` below.
#
# Returns:
#   The ggplot object produced by ggparcoord() + theme_bw(): one line per row
#   of the input, one axis per term, each axis rescaled with "uniminmax".
generate_ggally_parcoord_plot <- function(gbif_us_dataset) {
  # Term columns, sorted based on MIDS level; this is also the axis order.
  mids_terms <- c(
    "has_catalogNumber", "has_collectionCode", "has_countryCode",
    "has_speciesKey", "has_locality", "has_eventDate", "has_recordedBy",
    "has_coordinates", "has_image", "has_identifiedBy", "has_dateIdentified",
    "has_higherGeography"
  )

  # Fail early with a clear message instead of an obscure selection error.
  stopifnot(
    is.data.frame(gbif_us_dataset),
    all(c(mids_terms, "total", "institutionCode") %in% names(gbif_us_dataset))
  )

  # Generate percentage of totals as data (each term column divided by the
  # row's `total`) and keep institutionCode as the last column. The function
  # has to be passed inside across(); the earlier
  # mutate(across(everything()), . / total) form only worked by accident.
  percentage_gbif_us_dataset <- gbif_us_dataset %>%
    transmute(across(all_of(mids_terms), ~ .x / total), institutionCode)

  # Hand ggparcoord() a plain data.frame, as the previous cbind() step did.
  ggparcoord(
    data = as.data.frame(percentage_gbif_us_dataset),
    columns = seq_along(mids_terms),
    scale = "uniminmax",
    alphaLines = 0.05
  ) +
    theme_bw()
}

# Build the GGally parallel-coordinates plot for the full dataset, then print the returned
# plot object.
ggally_parcordd_plot <- generate_ggally_parcoord_plot(gbif_us_dataset)
ggally_parcordd_plot

“Up and down” pattern seen again.